#!/usr/bin/env bash

# Slurm trigger script for nodes in failing states
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/

# MUST be executed by the slurm user, consistent with:
# scontrol show config | grep SlurmUser

# Site-specific settings -- adjust these for your system:
slurm_user="slurm"          # account this script has to run as (SlurmUser)
slurm_notify="root"         # recipient of the notification mail
my_mail="/usr/bin/mailx"    # mail client used to send the notification

# Node state to trigger on, one of: down,drained,fail,up (see "man strigger")
my_state="drained"

# Fields printed by sacct
SACCT_FORMAT="JobID,JobName,Partition,User,Account,NodeList,State,ExitCode"
export SACCT_FORMAT

# Uncomment to make the trigger permanent:
# my_flags="--flags=PERM"

# Check that we run as the slurm user; abort otherwise.
# The message goes to stderr, and the exit status is 1 because shell exit
# statuses are limited to 0-255 ("exit -1" is silently turned into 255).
if [[ "$(id -nu)" != "$slurm_user" ]]
then
	echo "ERROR: The strigger command must be executed by the $slurm_user user" >&2
	exit 1
fi

# Commands to be run for notification.
# Globals:  my_state (read), my_flags (read, may be unset)
# Outputs:  node report, recent-job report and trigger message on stdout
# Actions:  reboots nodes whose Reason contains "Kill task failed" and
#           registers this script ($0) as the program of a new trigger
function my_tasks() {
	local nodelist
	# Print node info
	sinfo -lRN
	# Select Nodelist with Reason="Kill task failed" (case-insensitive match).
	# "paste -s" joins all matching lines into ONE comma-separated list;
	# without -s paste passes its input through unchanged, one entry per line.
	# (the paste command is from the coreutils package)
	nodelist=$(sinfo -h -t "$my_state" -o "%N %E" \
		| grep -i "Kill task failed" \
		| awk '{print $1}' \
		| paste -s -d , -)
	if [[ -n "$nodelist" ]]
	then
		echo
		echo "Recent jobs running on nodes $nodelist"
		sacct --nodelist="$nodelist" -S now-600
		echo
		echo "Now reboot and resume nodes $nodelist"
		scontrol reboot nextstate=resume "$nodelist"
		sinfo -N -n "$nodelist"
	fi
	# Submit trigger for next event ($0 = this script)
	echo
	echo "Setting new trigger --node --$my_state --program=$0"
	# my_flags is deliberately unquoted: it must vanish when empty/unset
	# and may hold several options.
	# shellcheck disable=SC2086
	strigger --set --node --"$my_state" --program="$0" ${my_flags:-}
}

# Notify Slurm administrator of node state.
# "$*" (not "$@") keeps all script arguments inside the single subject
# string; with "$@" every argument after the first becomes a separate word
# and would be taken by the mailer as an additional recipient.
# my_mail and slurm_notify stay unquoted on purpose so that they may hold a
# command with options or several recipients.
# shellcheck disable=SC2086
my_tasks 2>&1 | $my_mail -s "Nodes $my_state: $*" $slurm_notify
